import pandas as pd
import jieba
import string
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the review dataset and pull the text column out as a plain list.
# NOTE(review): the column is assumed to be named 'review' — adjust if the
# CSV schema differs.
comments = pd.read_csv('数据.csv')['review'].tolist()

# Tokenize every review with jieba and collect the tokens, skipping
# whitespace-only tokens and punctuation.
words = []
chinese_punctuation = "、，。！？；：“”‘’（）《》【】——…"
# Hoist the punctuation string out of the loop: the original rebuilt
# `string.punctuation + chinese_punctuation` for every single token.
# Kept as a string (not a set) on purpose — `word not in all_punctuation`
# is a substring test, so multi-char runs such as "——" are also filtered.
all_punctuation = string.punctuation + chinese_punctuation

for line in comments:
    # str() guards against NaN / non-string cells in the CSV column
    word_list = jieba.lcut(str(line))
    words.extend(
        word for word in word_list
        if word.strip() and word not in all_punctuation
    )

# Count the frequency of every token
word_counts = Counter(words)

# Print the 100 most frequent words (the old comment said 10 but the
# code has always printed 100)
for word, count in word_counts.most_common(100):
    print(f"{word}: {count}")

# Build the word cloud directly from the frequency table.
cloud = WordCloud(
    font_path='simhei.ttf',       # CJK-capable font, required for Chinese glyphs
    width=800,
    height=400,
    background_color='white',
)
cloud.generate_from_frequencies(word_counts)

# Render the cloud with matplotlib.
plt.figure(figsize=(10, 5))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.show()